Сonvert data to csv format to read it easily with R and remove unuseful columns from it
data <- read.csv("../data/IAD30284_384WellPlateDataSheet.csv", header = TRUE, skip = 14)
data <- data[data$Amplicon_ID != "Blank", ]
data$X384Well_Row <- NULL
data$X384Well_Col <- NULL
data$Gene_Symbol <- NULL
data$Genome_Version <- NULL
data$LineItem_Type <- NULL
Get amplicon sequence that lies between primers
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:utils':
##
## View
library(httr)
n <- nrow(data)
chromosomes <- data$Chr
beginings <- data$Insert_Start
endings <- data$Insert_Stop
ampliconInnerPart <- rep("", n)
for (i in 1:n){
call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
"&start=",as.character(beginings[i]),"&end=",as.character(endings[i])), collapse = "")
call
jsonData <- fromJSON(call)
ampliconInnerPart[i] <- jsonData$sequence
}
head(ampliconInnerPart)
## [1] "TGTTTTCTTTGATCTTACAGTTGTTATTAATTGTGATTGGAGCTATAGCAGTTGTCGCAGTTTTACAACCCTACATCTTTGTTGCAACAGTGCCAGTGATAGTGGCTTTTATTATGTTGAGAGCATATTTCCTCCAAACCTCACAGCAACTCAAACAACTGGAATCTGAAG"
## [2] "AATGCATTAATGCTATTCTGATTCTATAATATGTTTTTGCTCTCTTTTATAAATAGGATTTCTTACAAAAGCAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTAGTGATGGA"
## [3] "ATGGGGCTAATCTGGGAGTTGTTACAGGCGTCTGCCTTCTGTGGACTTGGTTTCCTGATAGTCCTTGCCCTTTTTCAGGCTGGGCTAGGGAGAATGATGATGAAGTACAGGTAGCAACCTATTTTCATAACTTGAAAGTTTTAAAAATTATGTTTTCAAAAAGCCCAC"
## [4] "ACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGATGTGTGTGTGTGTGTGTGTGTGTTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGT"
## [5] "CAGGCAAGGTAGTTCTTTTGTTCTTCACTATTAAGAACTTAATTTGGTGTCCATGTCTCTTTTTTTTTCTAGTTTGTAGTGCTGGAAGGTATTTTTGGAGAAATTCTTAC"
## [6] "AGTTCATTGACATGCCAACAGAAGGTAAACCTACCAAGTCAACCAAACCATACAAGAATGGCCAACTCTCGAAAGTTATGATTA"
data$Amplicon_Inner_Part <- ampliconInnerPart
Get dangling ends
danglingEndStart <- rep("", n)
danglingEndStop <- rep("", n)
for (i in 1:n){
call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
"&start=",as.character(beginings[i]-1),"&end=",as.character(beginings[i]-1)), collapse = "")
jsonData <- fromJSON(call)
danglingEndStart[i] <- jsonData$sequence
call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
"&start=",as.character(endings[i]+1),"&end=",as.character(endings[i]+1)), collapse = "")
jsonData <- fromJSON(call)
danglingEndStop[i] <- jsonData$sequence
}
data$Start_Dangling_End <- danglingEndStart
data$Stop_Dangling_End <- danglingEndStop
Load results and get a proportion for each
readResultFile <- function(name, addName = FALSE, delim = "\t"){
filePath <- paste(c("../data/", name), collapse = "")
result <- read.table(filePath, sep = delim, header = TRUE)
result$Gene <- NULL
result$sam <- NULL
result$X <- NULL
sums <- colSums(result[,-1])
proportion_result <- sweep(result[,-1], 2, sums, '/' )
#proportion_result <- sapply(proportion_result, function(x) log(x))
proportion_result$Means <- rowMeans(proportion_result)
proportion_result$Target <- result$Target
y <- data.frame(Target = character(127))
y$Target <- result$Target
ylog <- data.frame(Target = character(127))
ylog$Target <- result$Target
if (addName){
logresultStr <- paste(c(name,"_logresult"), collapse = "")
logMeansStr <- paste(c(name,"_logMeans"), collapse = "")
resultStr <- paste(c(name,"_result"), collapse = "")
MeansStr <- paste(c(name,"_Means"), collapse = "")
}else{
logresultStr <- "logresult"
logMeansStr <- "logMeans"
resultStr <- "result"
MeansStr <- "Means"
}
#ylog[resultStr] <- log(proportion_result[,1])
ylog[MeansStr] <- log(proportion_result$Means)
#y[resultStr] <- proportion_result[,1]
y[MeansStr] <- proportion_result$Means
ylog[ylog[MeansStr] == -Inf,][MeansStr] <- -10 # min(y[ylog$Means != -Inf,]$logMeans) - 20
#ylog[ylog[resultStr] == -Inf,][resultStr] <- -10 # min(y[ylog$result != -Inf,]$logresult) - 20
rownames(y) <- y$Target
rownames(ylog) <- ylog$Target
proportion <- proportion_result
proportion$Means <- NULL
proportion$Target <- NULL
rownames(proportion) <- proportion$Target
return(list(y = y, ylog = ylog, proportions = proportion))
}
y <- readResultFile("result.csv",FALSE, ",")$y
ylog <- readResultFile("result.csv",FALSE, ",")$ylog
#y <- readResultFile("SN1-21_Run_17.xls",FALSE)$y
#ylog <- readResultFile("SN1-21_Run_17.xls",FALSE)$ylog
resultFileNames <- c("SN1-17_Run_14.xls", "SN1-21_Run_17.xls","SN1-23-Run_18.xls","SN1-25-Run_19.xls", "SN1-26-Run_20.xls")
totalTable <- data.frame(Target = character(127))
totalTableOfMeans <- data.frame(Target = character(127))
totalTableOflogMeans <- data.frame(Target = character(127))
for (i in 1:length(resultFileNames)){
res <- readResultFile(resultFileNames[i], TRUE)
namesOfColumns <- names(res$proportions)
totalTable[namesOfColumns] <- res$proportions[namesOfColumns]
namesOfColumns <- names(res$y)
totalTableOfMeans[namesOfColumns] <- res$y
totalTableOflogMeans[namesOfColumns] <- res$ylog
}
totalTable$Target <- y$Target
totalTable$sd <- apply(totalTable[, -1], 1, sd)
totalTable$mean <- apply(totalTable[, -1], 1, mean)
totalTableOfMeans$sd <- apply(totalTableOfMeans[, -1], 1, sd)
totalTableOflogMeans$sd <- apply(totalTableOflogMeans[, -1], 1, sd)
rownames(totalTable) <- totalTable$Target
rownames(totalTableOfMeans) <- totalTableOfMeans$Target
rownames(totalTableOflogMeans) <- totalTableOflogMeans$Target
Prepare features
prepared_data <- data.frame(Target = character(172))
prepared_data$Target <- data$Amplicon_ID
prepared_data$Start_Dangling_End <- data$Start_Dangling_End
prepared_data$Stop_Dangling_End <- data$Stop_Dangling_End
prepared_data$Fwd_Primer_Len <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Fwd_Primer.), function(x) nchar(x)))
prepared_data$Rev_Primer_Len <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Rev_Primer.), function(x) nchar(x)))
prepared_data$Amplicon_Len <- as.numeric(lapply(as.character(data$Amplicon_Inner_Part), function(x) nchar(x)))
prepared_data$Fwd_END_3 <- as.character(lapply(data$Amplicon_Inner_Part, function(x) substr(x, nchar(x), nchar(x))))
prepared_data$Rev_END_3 <- as.character(lapply(data$Amplicon_Inner_Part, function(x) substr(x, 1, 1)))
countCharOccurrences <- function(char, s) {s2 <- gsub(char,"",s);return (nchar(s) - nchar(s2))}
countGCcontent <- function(x){ gg <- countCharOccurrences('G',x); cc <-countCharOccurrences('C',x); aa <- countCharOccurrences('A',x); tt <- countCharOccurrences('T',x); (cc+gg)*100/(aa+tt+cc+gg) }
prepared_data$Fwd_Primer_GC_content <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Fwd_Primer.), function(x) countGCcontent(x)))
prepared_data$Rev_Primer_GC_content <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Rev_Primer.), function(x) countGCcontent(x)))
prepared_data$Amplicon_GC_content <- as.numeric(lapply(as.character(data$Amplicon_Inner_Part), function(x) countGCcontent(x)))
rownames(prepared_data) <- prepared_data$Target
valid_names <- intersect(y$Target, data$Amplicon_ID)
prepared_data <- prepared_data[valid_names,]
prepared_data$Start_Dangling_End <- as.factor(prepared_data$Start_Dangling_End)
prepared_data$Stop_Dangling_End <- as.factor(prepared_data$Stop_Dangling_End)
prepared_data$Fwd_END_3 <- as.factor(prepared_data$Fwd_END_3)
prepared_data$Rev_END_3 <- as.factor(prepared_data$Rev_END_3)
y <- y[valid_names,]
ylog <- ylog[valid_names,]
totalTable <- totalTable[valid_names,]
totalTableOfMeans <- totalTableOfMeans[valid_names,]
totalTableOflogMeans <- totalTableOflogMeans[valid_names,]
Graphics of dependencies of result on different features
library(lattice)
xyplot(ylog$Means ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Amplicon GC-content")
xyplot(ylog$Means ~ prepared_data$Amplicon_Len, xlab="Length", ylab="log of amplicon percental amount", main="Amplicon length")
xyplot(ylog$Means ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Forward Primer GC-content")
xyplot(ylog$Means ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="log of amplicon percental amount", main="Forward Primer length")
xyplot(ylog$Means ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(ylog$Means ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="log of amplicon percental amount", main="Reverse Primer length")
xyplot(y$Means ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="amplicon percental amount", main="Amplicon GC-content")
xyplot(y$Means ~ prepared_data$Amplicon_Len, xlab="Length", ylab="amplicon percental amount", main="Amplicon length")
xyplot(y$Means ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="amplicon percental amount", main="Forward Primer GC-content")
xyplot(y$Means ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="amplicon percental amount", main="Forward Primer length")
xyplot(y$Means ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="amplicon percental amount", main="Reverse Primer GC-content")
xyplot(y$Means ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="amplicon percental amount", main="Reverse Primer length")
xyplot(totalTable$mean ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Amplicon GC-content")
xyplot(totalTable$mean ~ prepared_data$Amplicon_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Amplicon length")
xyplot(totalTable$mean ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Forward Primer GC-content")
xyplot(totalTable$mean ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Forward Primer length")
xyplot(totalTable$mean ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(totalTable$mean ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Reverse Primer length")
xyplot(totalTable$sd ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Amplicon GC-content")
xyplot(totalTable$sd ~ prepared_data$Amplicon_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Amplicon length")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Forward Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Forward Primer length")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Reverse Primer length")
xyplot(log(totalTable$mean) ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Amplicon GC-content")
xyplot(log(totalTable$mean) ~ prepared_data$Amplicon_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Amplicon length")
xyplot(log(totalTable$mean) ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Forward Primer GC-content")
xyplot(log(totalTable$mean) ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Forward Primer length")
xyplot(log(totalTable$mean) ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(log(totalTable$mean) ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="mean of amplicon percental amount", main="Reverse Primer length")
xyplot(totalTable$sd ~ prepared_data$Amplicon_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Amplicon GC-content")
xyplot(totalTable$sd ~ prepared_data$Amplicon_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Amplicon length")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Forward Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Forward Primer length")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_GC_content, xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Reverse Primer GC-content")
xyplot(totalTable$sd ~ prepared_data$Rev_Primer_Len, xlab="Length", ylab="sd of amplicon percental amount", main="Reverse Primer length")
histogram(~ylog$Means|factor(prepared_data$Start_Dangling_End))
Graphics of dependencies of standard variation of result
bwplot(prepared_data$Fwd_END_3 ~ ylog$Means, ylab="nucleotids", xlab="log of amplicon percental amount",
main="Forward primer 3' end")
bwplot(prepared_data$Rev_END_3 ~ ylog$Means, ylab="nucleotids", xlab="log of amplicon percental amount",
main="Reverse primer 3' end")
bwplot(prepared_data$Start_Dangling_End ~ ylog$Means, ylab="nucleotids", xlab="log of amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ ylog$Means, ylab="nucleotids", xlab="log of amplicon percental amount",
main="Second dangling end")
bwplot(prepared_data$Fwd_END_3 ~ y$Means, ylab="nucleotids", xlab="amplicon percental amount",
main="Forward primer 3' end")
bwplot(prepared_data$Rev_END_3 ~ y$Means, ylab="nucleotids", xlab="amplicon percental amount",
main="Reverse primer 3' end")
bwplot(prepared_data$Start_Dangling_End ~ y$Means, ylab="nucleotids", xlab="amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ y$Means, ylab="nucleotids", xlab="amplicon percental amount",
main="Second dangling end")
bwplot(prepared_data$Fwd_END_3 ~ totalTable$sd, ylab="nucleotids", xlab="standard deviation of amplicon percental amount",
main="Forward primer 3' end")
bwplot(prepared_data$Rev_END_3 ~ totalTable$sd, ylab="nucleotids", xlab="standard deviation of amplicon percental amount",
main="Reverse primer 3' end")
bwplot(prepared_data$Start_Dangling_End ~ totalTable$sd, ylab="nucleotids", xlab="standard deviation of amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ totalTable$sd, ylab="nucleotids", xlab="standard deviation of amplicon percental amount",
main="Second dangling end")
bwplot(prepared_data$Fwd_END_3 ~ totalTable$mean, ylab="nucleotids", xlab="mean of amplicon percental amount",
main="Forward primer 3' end")
bwplot(prepared_data$Rev_END_3 ~ totalTable$mean, ylab="nucleotids", xlab="mean of amplicon percental amount",
main="Reverse primer 3' end")
bwplot(prepared_data$Start_Dangling_End ~ totalTable$mean, ylab="nucleotids", xlab="mean of amplicon percental amount",
main="First dangling end")
bwplot(prepared_data$Stop_Dangling_End ~ totalTable$mean, ylab="nucleotids", xlab="mean of amplicon percental amount",
main="Second dangling end")
amplicon_len <- prepared_data$Amplicon_Len
q <- quantile(amplicon_len, probs = c(1/4, 1/2, 3/4, 1))
amplicon_len[amplicon_len <= q[1]] <- q[1]
amplicon_len[amplicon_len > q[1] & amplicon_len <= q[2]] <- q[2]
amplicon_len[amplicon_len > q[2] & amplicon_len <= q[3]] <- q[3]
amplicon_len[amplicon_len > q[3]] <- q[4]
histogram(amplicon_len)
bwplot(amplicon_len ~ y$Means, ylab="length", xlab="amplicon percental amount",
main="Amplicon length")
bwplot(amplicon_len ~ ylog$Means, ylab="length", xlab="log of amplicon percental amount",
main="Amplicon length")
bwplot(amplicon_len ~ totalTable$sd, ylab="length", xlab="standard deviation of amplicon percental amount",
main="Amplicon length")
bwplot(amplicon_len ~ totalTable$mean, ylab="length", xlab="mean of amplicon percental amount",
main="Amplicon length")
amplicon_gc <- prepared_data$Amplicon_GC_content
q <- quantile(amplicon_gc, probs = c(1/4, 1/2, 3/4, 1))
amplicon_gc[amplicon_gc <= q[1]] <- q[1]
amplicon_gc[amplicon_gc > q[1] & amplicon_gc <= q[2]] <- q[2]
amplicon_gc[amplicon_gc > q[2] & amplicon_gc <= q[3]] <- q[3]
amplicon_gc[amplicon_gc > q[3]] <- q[4]
histogram(amplicon_gc)
bwplot(amplicon_gc ~ y$Means, ylab="GC-content", xlab="amplicon percental amount",
main="Amplicon GC-content")
bwplot(amplicon_gc ~ ylog$Means, ylab="GC-content", xlab="log of amplicon percental amount",
main="Amplicon GC-content")
bwplot(amplicon_gc ~ totalTable$sd, ylab="GC-content", xlab="standard deviation of amplicon percental amount",
main="Amplicon GC-content")
bwplot(amplicon_gc ~ totalTable$mean, ylab="GC-content", xlab="mean of amplicon percental amount",
main="Amplicon GC-content")
primer_len <- prepared_data$Fwd_Primer_Len
q <- quantile(primer_len, probs = c(1/4, 1/2, 3/4, 1))
primer_len[primer_len <= q[1]] <- q[1]
primer_len[primer_len > q[1] & primer_len <= q[2]] <- q[2]
primer_len[primer_len > q[2] & primer_len <= q[3]] <- q[3]
primer_len[primer_len > q[3]] <- q[4]
histogram(primer_len)
bwplot(primer_len ~ y$Means, ylab="length", xlab="primer percental amount",
main="Forward Primer length")
bwplot(primer_len ~ ylog$Means, ylab="length", xlab="log of amplicon percental amount",
main="Forward Primer length")
bwplot(primer_len ~ totalTable$sd, ylab="length", xlab="standard deviation of amplicon percental amount",
main="Forward Primer length")
bwplot(primer_len ~ totalTable$mean, ylab="length", xlab="mean of amplicon percental amount",
main="Forward Primer length")
primer_gc <- prepared_data$Fwd_Primer_GC_content
q <- quantile(primer_gc, probs = c(1/4, 1/2, 3/4, 1))
primer_gc[primer_gc <= q[1]] <- q[1]
primer_gc[primer_gc > q[1] & primer_gc <= q[2]] <- q[2]
primer_gc[primer_gc > q[2] & primer_gc <= q[3]] <- q[3]
primer_gc[primer_gc > q[3]] <- q[4]
bwplot(primer_gc ~ y$Means, ylab="GC-content", xlab="primer percental amount",
main="Forward Primer GC-content")
bwplot(primer_gc ~ ylog$Means, ylab="GC-content", xlab="log of amplicon percental amount",
main="Forward Primer GC-content")
bwplot(primer_gc ~ totalTable$sd, ylab="GC-content", xlab="standard deviation of amplicon percental amount",
main="Forward Primer GC-content")
bwplot(primer_gc ~ totalTable$mean, ylab="GC-content", xlab="mean of amplicon percental amount",
main="Forward Primer GC-content")
primer_len <- prepared_data$Rev_Primer_Len
q <- quantile(primer_len, probs = c(1/4, 1/2, 3/4, 1))
primer_len[primer_len <= q[1]] <- q[1]
primer_len[primer_len > q[1] & primer_len <= q[2]] <- q[2]
primer_len[primer_len > q[2] & primer_len <= q[3]] <- q[3]
primer_len[primer_len > q[3]] <- q[4]
histogram(primer_len)
bwplot(primer_len ~ y$Means, ylab="length", xlab="primer percental amount",
main="Reverse Primer length")
bwplot(primer_len ~ ylog$Means, ylab="length", xlab="log of amplicon percental amount",
main="Reverse Primer length")
bwplot(primer_len ~ totalTable$sd, ylab="length", xlab="standard deviation of amplicon percental amount",
main="Reverse Primer length")
bwplot(primer_len ~ totalTable$mean, ylab="length", xlab="mean of amplicon percental amount",
main="Reverse Primer length")
primer_gc <- prepared_data$Rev_Primer_GC_content
q <- quantile(primer_gc, probs = c(1/4, 1/2, 3/4, 1))
primer_gc[primer_gc <= q[1]] <- q[1]
primer_gc[primer_gc > q[1] & primer_gc <= q[2]] <- q[2]
primer_gc[primer_gc > q[2] & primer_gc <= q[3]] <- q[3]
primer_gc[primer_gc > q[3]] <- q[4]
bwplot(primer_gc ~ y$Means, ylab="GC-content", xlab="primer percental amount",
main="Reverse Primer GC-content")
bwplot(primer_gc ~ ylog$Means, ylab="GC-content", xlab="log of amplicon percental amount",
main="Reverse Primer GC-content")
bwplot(primer_gc ~ totalTable$sd, ylab="GC-content", xlab="standard deviation of amplicon percental amount",
main="Reverse Primer GC-content")
bwplot(primer_gc ~ totalTable$mean, ylab="GC-content", xlab="mean of amplicon percental amount",
main="Reverse Primer GC-content")
prepared_data$res <- y$Means
prepared_data$logres <- ylog$Means
prepared_data$sd <- totalTable$sd
prepared_data$mean <- totalTable$mean
prepared_data$MeansSD <- totalTableOfMeans$sd
write.table(prepared_data, "../data/features.tsv", col.names = TRUE, sep = "\t", row.names = FALSE)
Use all features to train model
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
splitdf <- function(dataframe, n = 4, seed=NULL) {
if (!is.null(seed)) set.seed(seed)
index <- 1:nrow(dataframe)
trainindex <- sample(index, trunc(length(index)/n))
trainset <- dataframe[trainindex, ]
testset <- dataframe[-trainindex, ]
list(trainset=trainset,testset=testset)
}
splited <- splitdf(prepared_data)
trainSet <- splited$trainset
testSet <- splited$testset
formula <- res ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3 +Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 1.222811e-04
## Amplicon_GC_content 8.022959e-05
## Fwd_END_3 4.439550e-05
## Rev_END_3 3.503290e-05
## Start_Dangling_End 6.432253e-05
## Stop_Dangling_End 5.175747e-05
## Fwd_Primer_Len 3.296618e-05
## Fwd_Primer_GC_content 3.411650e-05
## Rev_Primer_Len 6.342570e-05
## Rev_Primer_GC_content 4.776624e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse <- function(error)
{
sqrt(mean(error^2))
}
sd(testSet$res)
## [1] 0.004352934
rmse(predicted - testSet$res)
## [1] 0.005948915
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.00410349
Use only amplicon information and 3’ end to train model
formula <- mean ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 8.263528e-05
## Amplicon_GC_content 8.958119e-05
## Fwd_END_3 4.622326e-05
## Rev_END_3 3.020851e-05
## Start_Dangling_End 2.560269e-05
## Stop_Dangling_End 2.761128e-05
## Fwd_Primer_Len 1.980689e-05
## Fwd_Primer_GC_content 2.579848e-05
## Rev_Primer_Len 5.707253e-05
## Rev_Primer_GC_content 5.210287e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$mean)
## [1] 0.005827737
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$mean)
## [1] 0.003820587
Use only amplicon information and 3’ end to train model
formula <- sd ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 1.885031e-05
## Amplicon_GC_content 8.647637e-06
## Fwd_END_3 8.821251e-06
## Rev_END_3 1.336054e-05
## Start_Dangling_End 2.338848e-06
## Stop_Dangling_End 1.904682e-06
## Fwd_Primer_Len 5.407571e-06
## Fwd_Primer_GC_content 6.072814e-06
## Rev_Primer_Len 3.076977e-06
## Rev_Primer_GC_content 4.455249e-06
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$sd)
## [1] 0.002614662
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$sd)
## [1] 0.001934353
Use only amplicon information and 3’ end to train model
formula <- res ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 1.734435e-04
## Amplicon_GC_content 1.545726e-04
## Fwd_END_3 9.905847e-05
## Rev_END_3 7.401096e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003957646
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003810592
Use only amplicon information to train model
formula <- res ~ Amplicon_Len + Amplicon_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Amplicon_Len 0.0002806780
## Amplicon_GC_content 0.0002711733
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
mean(testSet$res)
## [1] 0.007852893
rmse(predicted - testSet$res)
## [1] 0.003824522
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003644806
Use other to train model
formula <- res ~ Fwd_END_3 + Rev_END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
## IncNodePurity
## Fwd_END_3 7.019199e-05
## Rev_END_3 4.869826e-05
## Start_Dangling_End 7.722658e-05
## Stop_Dangling_End 6.943201e-05
## Fwd_Primer_Len 5.601442e-05
## Fwd_Primer_GC_content 5.896016e-05
## Rev_Primer_Len 8.590083e-05
## Rev_Primer_GC_content 9.094960e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.006385111
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.004877023
Different feature selection tests
library(FSelector)
formula <- res ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.4927980
## Amplicon_GC_content 0.4013567
## Fwd_END_3 0.1636325
## Rev_END_3 0.1544271
## Start_Dangling_End 0.2292457
## Stop_Dangling_End 0.1955937
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.20481515
## Amplicon_GC_content 0.20752242
## Fwd_END_3 0.05912807
## Rev_END_3 0.05150039
## Start_Dangling_End 0.11438548
## Stop_Dangling_End 0.09951913
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.20575099
## Amplicon_GC_content 0.35674389
## Fwd_END_3 0.03013050
## Rev_END_3 0.02683712
## Start_Dangling_End 0.06257012
## Stop_Dangling_End 0.05606093
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.12348679
## Amplicon_GC_content 0.14294834
## Fwd_END_3 0.02760319
## Rev_END_3 0.02428836
## Start_Dangling_End 0.05512733
## Stop_Dangling_End 0.04858213
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.3739993
## Amplicon_GC_content 2.6550875
## Fwd_END_3 0.7936508
## Rev_END_3 0.8669224
## Start_Dangling_End 0.9641420
## Stop_Dangling_End 0.9178561
## Fwd_Primer_Len 2.9170625
## Fwd_Primer_GC_content 2.9170625
## Rev_Primer_Len 2.9170625
## Rev_Primer_GC_content 2.9170625
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 29.6490533
## Amplicon_GC_content 29.7308283
## Fwd_END_3 -4.4841139
## Rev_END_3 2.0121176
## Start_Dangling_End -0.3056570
## Stop_Dangling_End -4.5728395
## Fwd_Primer_Len 4.9462923
## Fwd_Primer_GC_content 5.9103871
## Rev_Primer_Len 0.4344225
## Rev_Primer_GC_content 1.2793810
Different feature selection tests
library(FSelector)
formula <- MeansSD ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3 +Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.0000000
## Amplicon_GC_content 0.5266193
## Fwd_END_3 0.1729355
## Rev_END_3 0.1839802
## Start_Dangling_End 0.1274934
## Stop_Dangling_End 0.1298428
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance, main = "Chi-squared test", xlab = "Feature improtance")
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.18033461
## Fwd_END_3 0.06875444
## Rev_END_3 0.07664721
## Start_Dangling_End 0.03639421
## Stop_Dangling_End 0.03419287
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len NaN
## Amplicon_GC_content 0.39746114
## Fwd_END_3 0.03503591
## Rev_END_3 0.03994126
## Start_Dangling_End 0.01990803
## Stop_Dangling_End 0.01926146
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.12994916
## Fwd_END_3 0.03209714
## Rev_END_3 0.03614798
## Start_Dangling_End 0.01753995
## Stop_Dangling_End 0.01669189
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.9535005
## Amplicon_GC_content 2.7997829
## Fwd_END_3 0.8917744
## Rev_END_3 0.9302512
## Start_Dangling_End 0.7936508
## Stop_Dangling_End 0.9320720
## Fwd_Primer_Len 2.9535005
## Fwd_Primer_GC_content 2.9535005
## Rev_Primer_Len 2.9535005
## Rev_Primer_GC_content 2.9535005
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 6.2581151
## Amplicon_GC_content 9.5620202
## Fwd_END_3 -6.2590122
## Rev_END_3 0.8015642
## Start_Dangling_End -0.1979996
## Stop_Dangling_End -1.7647892
## Fwd_Primer_Len 3.2151594
## Fwd_Primer_GC_content 3.9072300
## Rev_Primer_Len 2.6484628
## Rev_Primer_GC_content -0.7854635
Different feature selection tests
library(FSelector)
histogram(prepared_data$sd)
sd(prepared_data$sd)
## [1] 0.001848055
formula <- sd ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.0000000
## Amplicon_GC_content 0.0000000
## Fwd_END_3 0.1441501
## Rev_END_3 0.2144925
## Start_Dangling_End 0.1902519
## Stop_Dangling_End 0.1559919
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance, main = "Chi-squared test", xlab = "Feature improtance")
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.00000000
## Fwd_END_3 0.04685588
## Rev_END_3 0.10385930
## Start_Dangling_End 0.09190803
## Stop_Dangling_End 0.05322211
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len NaN
## Amplicon_GC_content NaN
## Fwd_END_3 0.02387683
## Rev_END_3 0.05412162
## Start_Dangling_End 0.05027470
## Stop_Dangling_End 0.02998098
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.00000000
## Amplicon_GC_content 0.00000000
## Fwd_END_3 0.02187407
## Rev_END_3 0.04898162
## Start_Dangling_End 0.04429447
## Stop_Dangling_End 0.02598137
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.8965548
## Amplicon_GC_content 2.8965548
## Fwd_END_3 0.7936508
## Rev_END_3 0.9733988
## Start_Dangling_End 0.9049111
## Stop_Dangling_End 0.8319027
## Fwd_Primer_Len 2.8965548
## Fwd_Primer_GC_content 2.8965548
## Rev_Primer_Len 2.8965548
## Rev_Primer_GC_content 2.8965548
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 4.7710543
## Amplicon_GC_content 3.6755066
## Fwd_END_3 -3.5998294
## Rev_END_3 -2.5855665
## Start_Dangling_End 4.3668671
## Stop_Dangling_End -5.0886626
## Fwd_Primer_Len -1.6828855
## Fwd_Primer_GC_content 0.6827187
## Rev_Primer_Len 8.7946139
## Rev_Primer_GC_content 12.8004917
library(FSelector)
histogram(prepared_data$mean)
sd(prepared_data$mean)
## [1] 0.003951656
formula <- mean ~ Amplicon_Len + Amplicon_GC_content + Fwd_END_3 + Rev_END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.5359976
## Amplicon_GC_content 0.5601954
## Fwd_END_3 0.2088283
## Rev_END_3 0.1598876
## Start_Dangling_End 0.1638039
## Stop_Dangling_End 0.2313565
## Fwd_Primer_Len 0.0000000
## Fwd_Primer_GC_content 0.0000000
## Rev_Primer_Len 0.0000000
## Rev_Primer_GC_content 0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance, main = "Chi-squared test", xlab = "Feature improtance")
information.gain(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.23081264
## Amplicon_GC_content 0.20283463
## Fwd_END_3 0.09499156
## Rev_END_3 0.07328269
## Start_Dangling_End 0.06088759
## Stop_Dangling_End 0.12678523
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
gain.ratio(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.23510367
## Amplicon_GC_content 0.42347307
## Fwd_END_3 0.04840583
## Rev_END_3 0.03818800
## Start_Dangling_End 0.03330618
## Stop_Dangling_End 0.07142041
## Fwd_Primer_Len NaN
## Fwd_Primer_GC_content NaN
## Rev_Primer_Len NaN
## Rev_Primer_GC_content NaN
symmetrical.uncertainty(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 0.13973841
## Amplicon_GC_content 0.14484429
## Fwd_END_3 0.04434561
## Rev_END_3 0.03456123
## Start_Dangling_End 0.02934438
## Stop_Dangling_End 0.06189259
## Fwd_Primer_Len 0.00000000
## Fwd_Primer_GC_content 0.00000000
## Rev_Primer_Len 0.00000000
## Rev_Primer_GC_content 0.00000000
oneR(formula, data = prepared_data)
## attr_importance
## Amplicon_Len 2.4839783
## Amplicon_GC_content 2.8248885
## Fwd_END_3 0.9444674
## Rev_END_3 0.7936508
## Start_Dangling_End 0.8631728
## Stop_Dangling_End 1.0921138
## Fwd_Primer_Len 2.9549953
## Fwd_Primer_GC_content 2.9549953
## Rev_Primer_Len 2.9549953
## Rev_Primer_GC_content 2.9549953
random.forest.importance(formula, data = prepared_data, importance.type = 1)
## attr_importance
## Amplicon_Len 14.1091624
## Amplicon_GC_content 27.5501825
## Fwd_END_3 -3.8936257
## Rev_END_3 0.4302335
## Start_Dangling_End -0.9945405
## Stop_Dangling_End -2.2462744
## Fwd_Primer_Len 1.4025869
## Fwd_Primer_GC_content 2.7979556
## Rev_Primer_Len 5.2877999
## Rev_Primer_GC_content -1.7798436
Feature subset selection tests
library(corrplot)
prepared_data_num <- prepared_data
prepared_data_num$Start_Dangling_End <- as.numeric(prepared_data_num$Start_Dangling_End)
prepared_data_num$Stop_Dangling_End <- as.numeric(prepared_data_num$Stop_Dangling_End)
prepared_data_num$Fwd_END_3 <- as.numeric(prepared_data_num$Fwd_END_3)
prepared_data_num$Rev_END_3 <- as.numeric(prepared_data_num$Rev_END_3)
prepared_data_num$res <- NULL
prepared_data_num$logres <- NULL
prepared_data_num$MeansSD <- NULL
prepared_data.scale<- scale(prepared_data_num[2:ncol(prepared_data_num)], center=TRUE, scale=TRUE)
cor_prepared_data <- cor(prepared_data.scale)
corrplot(cor_prepared_data, order = "original")
library(cvTools)
## Loading required package: robustbase
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:httr':
##
## progress
# control <- trainControl(method="repeatedcv", number=10, repeats=3)
# model <- train(res~., data=prepared_data, method="lvq", preProcess="scale", trControl=control)
# # estimate variable importance
# importance <- varImp(model, scale=FALSE)
# # summarize importance
# print(importance)
# # plot importance
# plot(importance)
evaluator <- function(subset) {
formula <- mean ~ .
valid_names <- c(subset,"res")
rf <- randomForest(formula, prepared_data[,valid_names])
result <- cvFit(rf, data = prepared_data[,valid_names], y = prepared_data[,valid_names]$res, cost = rtmspe,
K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234)
return(1-result$cv)
}
#perform the best subset search
#subset <- exhaustive.search(names(prepared_data[-c(1, 11)]), evaluator)
#subset <- best.first.search(names(prepared_data[-c(1, 11)]), evaluator)